# Immigration and the top 1%

# how likely are people to stay at the top


# Name of the population control total variable (the "18plus" total)
var_control <- paste0("pop_tot_", "18plus")

# Top shares to analyse, as proportions: 0.1% and 0.01% divided by 100
ts_vec <- c(0.1, 0.01) / 100

# loop over top shares
#
# For every top share in ts_vec and every base year 1997-2018, take the people
# at the top in the base year and record, for each tax year 1997-2018, whether
# they are at the top in that year as well.
#
# Files written for each (top share, base year):
#   output/persist_Top<share>_<base_year>.pdf   graph of the share still at the
#                                               top, by migrant_comb
#   output/persist_Top<share>_<base_year>.csv   data behind the graph
#   output/top<share>_base_year<base_year>.fst  per-person top status in the
#                                               base year and the year before
map(ts_vec, function(ts) {
  # top share as string (used in the output file names).
  # ts comes out of a floating-point division (see ts_vec), so it is compared
  # with a tolerance rather than with `==`. This is done once per top share,
  # before any data are read, so that an unsupported share fails immediately.
  if (isTRUE(all.equal(ts, 0.01))) {
    ts_string <- "1"
  } else if (isTRUE(all.equal(ts, 0.001))) {
    ts_string <- "01"
  } else if (isTRUE(all.equal(ts, 0.0001))) {
    ts_string <- "001"
  } else {
    stop("wrong top share")
  }

  # loop over base years
  furrr::future_map(1997:2018, function(base_year) {
    # upload base-year data
    data <- read_fst(paste0("datalab_data_folder/SA_PAYE", base_year, ".fst"))

    # keep people at the top: the first how_many_top(base_year, ts) rows.
    # head() is used instead of data[1:n, ] because it neither pads the result
    # with NA rows when n exceeds the number of rows nor returns row 1 when
    # n is 0.
    # NOTE(review): this assumes the file is already ordered so that the top
    # group comes first -- confirm against the code that writes the files.
    data <- head(data, how_many_top(base_year, ts))

    # identifiers and characteristics of the base-year top group
    tt <- data %>%
      select(id, anon_utr, nino_anon, migrant_comb, tyob)

    # # # #

    ## check whether people are at the top

    # years
    yy <- 2018:1997

    # loop: one copy of tt per tax year, flagged with top status in that year
    tab <- future_map_dfr(yy, function(x) {
      # people at the top in tax year x
      dd <- read_fst(paste0("datalab_data_folder/SA_PAYE", x, ".fst"))
      dd <- head(dd, how_many_top(x, ts))

      # non-missing identifiers at the top in tax year x
      # (NAs are removed so that a missing identifier never counts as a match)
      utr_at_top <- dd[["anon_utr"]]
      utr_at_top <- utr_at_top[!is.na(utr_at_top)]

      id_at_top <- dd[["id"]]
      id_at_top <- id_at_top[!is.na(id_at_top)]

      # a person is at the top in year x if their UTR or their id is found
      # among that year's top group.
      # NOTE(review): nino_anon is not used for the match (the original code
      # computed a NINO dummy but never used it) -- confirm this is intended.
      as_tibble(tt) %>%
        mutate(
          ts = ts,
          tax_year = x,
          top_dummy = if_else(
            anon_utr %in% utr_at_top | id %in% id_at_top, 1, 0
          )
        )
    }, .progress = TRUE)


    # # # #

    # number of base-year top people still at the top, by year and migrant_comb
    counts <- tab %>%
      group_by(tax_year, migrant_comb) %>%
      summarise(top = sum(top_dummy)) %>%
      ungroup()

    # size of the base-year top group, by migrant_comb
    base_count <- tt %>%
      group_by(migrant_comb) %>%
      summarise(n = n())

    # proportion of the base-year top group that is at the top in each year
    counts <- counts %>%
      left_join(base_count, by = "migrant_comb") %>%
      mutate(prop = top / n)


    # # # #

    ## graph
    # kept in a variable and passed to ggsave() explicitly, rather than relying
    # on ggsave()'s default of saving whatever last_plot() happens to be
    persist_plot <- counts %>%
      ggplot(.,
             aes(tax_year, prop, group = migrant_comb, color = migrant_comb)) +
      geom_point() +
      geom_line() +
      scale_color_manual(values = c("navy", "maroon")) +
      scale_y_continuous(limits = c(0, 1)) +
      theme_minimal()

    # file name shared by the graph and its underlying data
    file_stem <- paste0("persist", "_Top", ts_string, "_", base_year)

    # save graph
    ggsave(
      filename = paste0(file_stem, ".pdf"),
      plot = persist_plot,
      path = "output/",
      dpi = "retina",
      width = 16,
      height = 9,
      units = "cm"
    )

    ## save underlying data
    write_csv(counts, paste0("output/", file_stem, ".csv"))


    # # # #

    ## joiners into the top group: top status in the base year and the year
    ## before (for base year 1997 there is no earlier year in the data, so
    ## only the base-year column is created)
    tab2 <- tab %>%
      filter(tax_year == base_year | tax_year == (base_year - 1)) %>%
      mutate(year = case_when(
        tax_year == base_year ~ "base",
        tax_year == (base_year - 1) ~ "minus_1",
        TRUE ~ NA_character_
      ))

    # long to wide: one row per person, columns year_base / year_minus_1.
    # spread() and mutate_at() are kept on purpose: the script is written
    # against the older tidyr/dplyr API.
    tab2 <- tab2 %>%
      select(anon_utr, id, nino_anon, migrant_comb, tyob, year, top_dummy) %>%
      spread(key = "year",
             value = "top_dummy",
             sep = "_") %>%
      mutate_at(vars(starts_with("year")), "as.numeric")


    ## save data
    write_fst(
      tab2,
      paste0("output/top", ts_string, "_base_year", base_year, ".fst")
    )

    # nothing to return: this function is run for the files it writes
    invisible(NULL)
  })
})


# # # #

## combine csv files to prepare for output release

# list of per-base-year persistence files, named
# persist_Top<share>_<base_year>.csv.
# The pattern is needed because output/ also holds the .pdf graphs and .fst
# files, which must not be passed to read_csv(); it also leaves out a
# previously combined persist_allyears.csv.
file_list <- list.files(
  "output/",
  pattern = "^persist_Top[0-9]+_[0-9]+\\.csv$",
  full.names = TRUE
)

# loop over file list: read each file and add the base year and the top share,
# both recovered from the file name
tab <- future_map_dfr(file_list, function(file_name) {

  # upload
  data <- read_csv(file_name)

  # remove csv
  file_stem <- gsub("\\.csv", "", file_name)

  # base year: the digits at the end of the file name
  base_year <- as.numeric(str_extract(string = file_stem, "\\d+$"))

  # top share label: the digits following "Top"
  ts_string <- str_extract(file_stem, "Top[:digit:]+")
  ts_string <- str_extract(ts_string, "[:digit:]+")

  # top share as numeric; an unknown label is an error rather than being left
  # undefined (an undefined `ts` would resolve to the stats::ts function)
  if (identical(ts_string, "1")) {
    top_share <- .01
  } else if (identical(ts_string, "01")) {
    top_share <- .001
  } else if (identical(ts_string, "001")) {
    top_share <- .0001
  } else {
    stop("wrong top share in file name: ", file_name)
  }

  # create variables with base year and top share
  data %>%
    mutate(base_year = base_year,
           ts = top_share)
})

# save output
write_csv(tab, "output/persist_allyears.csv")


# Now results on mobility rather than persistence: where in the distribution
# the people at the top in 2017 were in earlier years (2007, 2012, 2016).
#   Done in Stata, as produced while R packages weren't available in the lab
#   temporarily.
#
# The Stata program is kept as an R string. Every double quote that belongs to
# the Stata code is escaped as \" -- left unescaped, the first one ends the R
# string early and the file no longer parses.
stata_code <- "
  *****************
  ***PERCENTILES***
  *****************
  cd `\"datalab_data_folder/\"'
  *!`\"C:/Program Files (x86)/7-Zip/7z\"'' e `\"./sa2017.csv.gz\"' -y
  import delim using sa2017.csv, clear $restrict

  keep id nino_anon anon_utr ti sex migrant_comb tyob weight
      
  destring anon_utr ti weight, force replace


  preserve 

  **PAYE only
  *!`\"C:/Program Files (x86)/7-Zip/7z\"'' e `\"./paye2017.csv.gz\"' -y
  import delim using paye2017.csv, clear $restrict

  keep id nino_anon anon_utr ti sex migrant_comb tyob weight

  destring anon_utr ti weight, force replace
      
  *Append to SA data
  tempfile paye_data
  save `paye_data', replace

  restore
  append using `paye_data', force

  local year = 2017

  gsort -ti
  g cumsum = sum(weight)

  g pct_99_`year' = (cumsum <= `popn`year'')

  keep if pct_99_2017 == 1

  save ineq_rnr_top1, replace


  *Total income for top 1%
  sum ti [w = weight]
  di r(sum)/1e+09

  *Need to remove duplicates (keep migrant observations)
  duplicates tag id, gen(dup)
  keep if dup == 0 | (dup == 1 & migrant_comb == 1)
  drop dup


  foreach year in 2007 2012 2016 {
    cap drop _merge
    
    merge 1:1 id using prev_pctile_`year', keep(match master)
  }


  tab2 distrib_2016 migrant_comb, mi
  tab2 distrib_2012 migrant_comb, mi
  tab2 distrib_2007 migrant_comb, mi

  table migrant_comb distrib_2016, c(freq sum weight) mi
  table migrant_comb distrib_2012, c(freq sum weight) mi
  table migrant_comb distrib_2007, c(freq sum weight) mi
"

# NOTE(review): this hands the whole multi-line program to Stata as a single
# command-line argument. Stata's batch mode is normally given a do-file
# (`stata -e do <file>`), so this call may need the program written to a
# do-file first -- confirm how it was actually run in the lab.
system(paste("stata -e", shQuote(stata_code, type = "cmd")), intern = TRUE)
# The outputs were manually saved to mobility.csv (copy-paste), as R packages
# weren't available in the lab during this period.